library(tidyverse)
library(corrplot)
library(leaps)
library(performance)
library(MASS)
library(caret)

knitr::opts_chunk$set(
  fig.width = 8,
  fig.asp = .6,
  out.width = "90%",
  echo = TRUE, warning = FALSE, dpi=300
  
)

Step 1: Data Preprocessing

After importing the csv file containing the County Demographic Information (CDI) data, we notice that crimes, physicians, and hospital beds are given as numbers, while other info are given as proportions. We therefore compute the number of crimes, physicians, and hospital beds per 1000 people.

cdi_data = read_csv("./data/cdi.csv") %>%
  janitor::clean_names() %>%
  mutate(
    cty_state = str_c(cty,",",state),
    docs_rate_1000 = 1000 * docs/pop, 
    # Compute number of doctors/hospital beds per 1000 people.
    beds_rate_1000 = 1000 * beds/pop,
    density = as.numeric(pop)/as.numeric(area),
    crime_rate_1000 = 1000 * crimes/pop) %>% 
  # Compute number of crimes per 1000 people. 
  dplyr::select(-docs,-beds,-crimes) %>%
  relocate(id,cty_state,cty)

#knitr::kable(head(cdi_data))

Step 2 - Exploratory Analysis

We then take a closer look of each variables, calculate the pairwise correlations between variables, and list all the correlations between the crime rate (our interest) and all other variables.

cdi_data_exp = cdi_data %>%
  dplyr::select(-id,-cty,-state, -cty_state) 
par(mfrow=c(4,3))
boxplot(cdi_data_exp$area,main="Area")
boxplot(cdi_data_exp$pop,main="Population")
boxplot(cdi_data_exp$pop18,main="Population 18-34")
boxplot(cdi_data_exp$pop65,main="Population 65+")
boxplot(cdi_data_exp$hsgrad,main="Highschool grads")
boxplot(cdi_data_exp$bagrad,main="Bachelor's grads")

#par(mfrow=c(2,3))
boxplot(cdi_data_exp$poverty,main="Poverty Rate")
boxplot(cdi_data_exp$unemp,main="Unemployment Rate")
boxplot(cdi_data_exp$pcincome,main="Income Per Capita")
boxplot(cdi_data_exp$totalinc,main="Income Total")
boxplot(cdi_data_exp$docs_rate_1000,main="Active Physicians")
boxplot(cdi_data_exp$beds_rate_1000,main="Hospital Beds")
\label{fig:figs}boxplot of continuous variables distribution

boxplot of continuous variables distribution

par(mfrow=c(1,1))

ggplot(cdi_data,aes(region)) + 
  geom_histogram(binwidth = 0.5) +
  theme_classic() +
  xlab("Region")+
  ylab("Count") +
  labs(title = "Histogram: Counts of four regions")
\label{fig:figs}Histogram of catagorical variable:region distribution

Histogram of catagorical variable:region distribution

boxplot(cdi_data_exp$crime_rate_1000,main="Boxplot of Crime Rate",horizontal = TRUE)
\label{fig:figs}boxplot of dependent variable: crime rate

boxplot of dependent variable: crime rate

# data exploratory
# pairs(cdi_data_exp)
# correlation plot
cdi_data_cor = cor(cdi_data_exp)
corrplot(cdi_data_cor, type = "upper", diag = FALSE, title = "Correlation heatmap")
\label{fig:figs}Correlation heatmap

Correlation heatmap

crime_1000_cor = data.frame(cdi_data_cor) %>% 
  dplyr::select("Crime Rate (Per 1000)" = crime_rate_1000) %>% 
  t()

#knitr::kable(crime_1000_cor,digits = 2) 

Training/Test set split

cdi_data = cdi_data %>% 
  dplyr::select(-id,-cty_state, -cty,-state) %>% 
  mutate(region = factor(region))

set.seed(1)
dt = sort(sample(nrow(cdi_data), nrow(cdi_data)*.9))
train_data = cdi_data[dt,]
test_data = cdi_data[-dt,]

Remove outliers and high leverage point

# Remove high leverage points

cdi_data_clean = train_data[train_data$area >= quantile(train_data$area,0.002) & train_data$area <= quantile(train_data$area,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$pop >= quantile(cdi_data_clean$pop,0.002) & cdi_data_clean$pop <= quantile(cdi_data_clean$pop,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$pop18 >= quantile(cdi_data_clean$pop18,0.002) & cdi_data_clean$pop18 <= quantile(cdi_data_clean$pop18,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$pop65 >= quantile(cdi_data_clean$pop65,0.002) & cdi_data_clean$pop65 <= quantile(cdi_data_clean$pop65,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$hsgrad >= quantile(cdi_data_clean$hsgrad,0.002) & cdi_data_clean$hsgrad <= quantile(cdi_data_clean$hsgrad,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$bagrad >= quantile(cdi_data_clean$bagrad,0.002) & cdi_data_clean$bagrad <= quantile(cdi_data_clean$bagrad,0.998),]

cdi_data_clean = cdi_data_clean[cdi_data_clean$poverty >= quantile(cdi_data_clean$poverty,0.002) & cdi_data_clean$poverty <= quantile(cdi_data_clean$poverty,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$unemp >= quantile(cdi_data_clean$unemp,0.002) & cdi_data_clean$unemp <= quantile(cdi_data_clean$unemp,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$pcincome >= quantile(cdi_data_clean$pcincome,0.002) & cdi_data_clean$pcincome <= quantile(cdi_data_clean$pcincome,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$totalinc >= quantile(cdi_data_clean$totalinc,0.002) & cdi_data_clean$totalinc <= quantile(cdi_data_clean$totalinc,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$docs_rate_1000 >= quantile(cdi_data_clean$docs_rate_1000,0.002) & cdi_data_clean$docs_rate_1000 <= quantile(cdi_data_clean$docs_rate_1000,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$beds_rate_1000 >= quantile(cdi_data_clean$beds_rate_1000,0.002) & cdi_data_clean$beds_rate_1000 <= quantile(cdi_data_clean$beds_rate_1000,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$beds_rate_1000 >= quantile(cdi_data_clean$beds_rate_1000,0.002) & cdi_data_clean$beds_rate_1000 <= quantile(cdi_data_clean$beds_rate_1000,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$density >= quantile(cdi_data_clean$density,0.002) & cdi_data_clean$density <= quantile(cdi_data_clean$density,0.998),]

cdi_data_clean = cdi_data_clean[cdi_data_clean$crime_rate_1000 >= quantile(cdi_data_clean$crime_rate_1000,0.002) & cdi_data_clean$beds_rate_1000 <= quantile(cdi_data_clean$crime_rate_1000,0.998),]
par(mfrow=c(4,3))
boxplot(cdi_data_clean$area,main="Area")
boxplot(cdi_data_clean$pop,main="Population")
boxplot(cdi_data_clean$pop18,main="Population 18-34")
boxplot(cdi_data_clean$pop65,main="Population 65+")
boxplot(cdi_data_clean$hsgrad,main="Highschool grads")
boxplot(cdi_data_clean$bagrad,main="Bachelor's grads")

boxplot(cdi_data_clean$poverty,main="Poverty Rate")
boxplot(cdi_data_clean$unemp,main="Unemployment Rate")
boxplot(cdi_data_clean$pcincome,main="Income Per Capita")
boxplot(cdi_data_clean$totalinc,main="Income Total")
boxplot(cdi_data_clean$docs_rate_1000,main="Active Physicians")
boxplot(cdi_data_clean$beds_rate_1000,main="Hospital Beds")
\label{fig:figs}Boxplot of each continuous variables aftern cleaning outliers

Boxplot of each continuous variables aftern cleaning outliers

Model construction

Data used for building model:

cdi_model = cdi_data_clean

Stepwise regression

full.fit = lm(crime_rate_1000 ~ ., data = cdi_model)
summary(full.fit) %>% 
  broom::tidy() %>%
  mutate(p_rank = rank(p.value))
## # A tibble: 17 × 6
##    term               estimate  std.error statistic  p.value p_rank
##    <chr>                 <dbl>      <dbl>     <dbl>    <dbl>  <dbl>
##  1 (Intercept)    -108.        29.0         -3.71   2.40e- 4      8
##  2 area             -0.000699   0.000955    -0.732  4.65e- 1     16
##  3 pop               0.0000806  0.0000136    5.95   6.57e- 9      3
##  4 pop18             1.28       0.371        3.45   6.29e- 4      9
##  5 pop65            -0.0161     0.324       -0.0497 9.60e- 1     17
##  6 hsgrad            0.349      0.279        1.25   2.12e- 1     13
##  7 bagrad           -0.694      0.330       -2.10   3.63e- 2     11
##  8 poverty           1.88       0.432        4.34   1.86e- 5      6
##  9 unemp             0.885      0.548        1.61   1.07e- 1     12
## 10 pcincome          0.00325    0.000620     5.25   2.63e- 7      4
## 11 totalinc         -0.00341    0.000658    -5.19   3.52e- 7      5
## 12 region2          11.2        2.77         4.04   6.67e- 5      7
## 13 region3          29.8        2.70        11.1    1.47e-24      1
## 14 region4          24.3        3.60         6.75   6.31e-11      2
## 15 docs_rate_1000    1.30       1.26         1.03   3.03e- 1     14
## 16 beds_rate_1000    2.54       0.901        2.82   5.13e- 3     10
## 17 density           0.000701   0.000738     0.950  3.43e- 1     15
backward = step(full.fit, direction='backward') %>%  broom::tidy() %>%  rename(backward = "term")
## Start:  AIC=2062.47
## crime_rate_1000 ~ area + pop + pop18 + pop65 + hsgrad + bagrad + 
##     poverty + unemp + pcincome + totalinc + region + docs_rate_1000 + 
##     beds_rate_1000 + density
## 
##                  Df Sum of Sq    RSS    AIC
## - pop65           1         1  92276 2060.5
## - area            1       141  92416 2061.0
## - density         1       238  92513 2061.4
## - docs_rate_1000  1       280  92555 2061.6
## - hsgrad          1       411  92687 2062.1
## <none>                         92275 2062.5
## - unemp           1       687  92963 2063.2
## - bagrad          1      1164  93439 2065.1
## - beds_rate_1000  1      2092  94367 2068.7
## - pop18           1      3138  95413 2072.7
## - poverty         1      4967  97242 2079.7
## - totalinc        1      7109  99384 2087.7
## - pcincome        1      7269  99545 2088.3
## - pop             1      9328 101603 2095.8
## - region          3     35147 127422 2174.9
## 
## Step:  AIC=2060.47
## crime_rate_1000 ~ area + pop + pop18 + hsgrad + bagrad + poverty + 
##     unemp + pcincome + totalinc + region + docs_rate_1000 + beds_rate_1000 + 
##     density
## 
##                  Df Sum of Sq    RSS    AIC
## - area            1       144  92420 2059.1
## - density         1       238  92513 2059.4
## - docs_rate_1000  1       279  92555 2059.6
## - hsgrad          1       413  92689 2060.1
## <none>                         92276 2060.5
## - unemp           1       698  92974 2061.2
## - bagrad          1      1163  93439 2063.1
## - beds_rate_1000  1      2217  94493 2067.2
## - pop18           1      4127  96403 2074.5
## - poverty         1      5160  97436 2078.4
## - totalinc        1      7152  99428 2085.9
## - pcincome        1      7324  99600 2086.5
## - pop             1      9371 101646 2094.0
## - region          3     35176 127451 2173.0
## 
## Step:  AIC=2059.05
## crime_rate_1000 ~ pop + pop18 + hsgrad + bagrad + poverty + unemp + 
##     pcincome + totalinc + region + docs_rate_1000 + beds_rate_1000 + 
##     density
## 
##                  Df Sum of Sq    RSS    AIC
## - docs_rate_1000  1       282  92702 2058.2
## - density         1       397  92817 2058.6
## - hsgrad          1       474  92894 2058.9
## <none>                         92420 2059.1
## - unemp           1       626  93046 2059.5
## - bagrad          1      1195  93615 2061.8
## - beds_rate_1000  1      2220  94640 2065.8
## - pop18           1      4080  96500 2072.9
## - poverty         1      5090  97510 2076.7
## - totalinc        1      7017  99437 2083.9
## - pcincome        1      7263  99683 2084.8
## - pop             1      9229 101649 2092.0
## - region          3     35031 127452 2171.0
## 
## Step:  AIC=2058.16
## crime_rate_1000 ~ pop + pop18 + hsgrad + bagrad + poverty + unemp + 
##     pcincome + totalinc + region + beds_rate_1000 + density
## 
##                  Df Sum of Sq    RSS    AIC
## - hsgrad          1       394  93095 2057.7
## <none>                         92702 2058.2
## - density         1       580  93282 2058.4
## - unemp           1       637  93339 2058.7
## - bagrad          1       951  93653 2059.9
## - pop18           1      4264  96966 2072.7
## - poverty         1      5090  97792 2075.8
## - beds_rate_1000  1      5318  98020 2076.6
## - totalinc        1      6972  99674 2082.8
## - pcincome        1      7672 100373 2085.3
## - pop             1      9205 101907 2090.9
## - region          3     35510 128212 2171.2
## 
## Step:  AIC=2057.72
## crime_rate_1000 ~ pop + pop18 + bagrad + poverty + unemp + pcincome + 
##     totalinc + region + beds_rate_1000 + density
## 
##                  Df Sum of Sq    RSS    AIC
## - density         1       435  93531 2057.4
## <none>                         93095 2057.7
## - unemp           1       522  93618 2057.8
## - bagrad          1       561  93656 2057.9
## - pop18           1      3989  97084 2071.1
## - poverty         1      4766  97861 2074.0
## - beds_rate_1000  1      5331  98426 2076.2
## - totalinc        1      7222 100317 2083.1
## - pcincome        1      7324 100420 2083.5
## - pop             1      9501 102597 2091.4
## - region          3     35119 128215 2169.2
## 
## Step:  AIC=2057.43
## crime_rate_1000 ~ pop + pop18 + bagrad + poverty + unemp + pcincome + 
##     totalinc + region + beds_rate_1000
## 
##                  Df Sum of Sq    RSS    AIC
## - unemp           1       504  94035 2057.4
## <none>                         93531 2057.4
## - bagrad          1       709  94239 2058.2
## - pop18           1      4686  98217 2073.4
## - poverty         1      5568  99099 2076.7
## - beds_rate_1000  1      5625  99156 2076.9
## - totalinc        1      7389 100919 2083.3
## - pcincome        1      8910 102440 2088.8
## - pop             1      9968 103498 2092.6
## - region          3     34810 128341 2167.6
## 
## Step:  AIC=2057.4
## crime_rate_1000 ~ pop + pop18 + bagrad + poverty + pcincome + 
##     totalinc + region + beds_rate_1000
## 
##                  Df Sum of Sq    RSS    AIC
## <none>                         94035 2057.4
## - bagrad          1      1394  95428 2060.8
## - pop18           1      4681  98715 2073.2
## - beds_rate_1000  1      5122  99156 2074.9
## - totalinc        1      7551 101586 2083.8
## - poverty         1      8455 102489 2087.0
## - pcincome        1     10060 104095 2092.7
## - pop             1     10133 104167 2093.0
## - region          3     35812 129846 2169.8
both = step(full.fit, direction = "both") %>% broom::tidy() %>% rename(stepwise = "term")
## Start:  AIC=2062.47
## crime_rate_1000 ~ area + pop + pop18 + pop65 + hsgrad + bagrad + 
##     poverty + unemp + pcincome + totalinc + region + docs_rate_1000 + 
##     beds_rate_1000 + density
## 
##                  Df Sum of Sq    RSS    AIC
## - pop65           1         1  92276 2060.5
## - area            1       141  92416 2061.0
## - density         1       238  92513 2061.4
## - docs_rate_1000  1       280  92555 2061.6
## - hsgrad          1       411  92687 2062.1
## <none>                         92275 2062.5
## - unemp           1       687  92963 2063.2
## - bagrad          1      1164  93439 2065.1
## - beds_rate_1000  1      2092  94367 2068.7
## - pop18           1      3138  95413 2072.7
## - poverty         1      4967  97242 2079.7
## - totalinc        1      7109  99384 2087.7
## - pcincome        1      7269  99545 2088.3
## - pop             1      9328 101603 2095.8
## - region          3     35147 127422 2174.9
## 
## Step:  AIC=2060.47
## crime_rate_1000 ~ area + pop + pop18 + hsgrad + bagrad + poverty + 
##     unemp + pcincome + totalinc + region + docs_rate_1000 + beds_rate_1000 + 
##     density
## 
##                  Df Sum of Sq    RSS    AIC
## - area            1       144  92420 2059.1
## - density         1       238  92513 2059.4
## - docs_rate_1000  1       279  92555 2059.6
## - hsgrad          1       413  92689 2060.1
## <none>                         92276 2060.5
## - unemp           1       698  92974 2061.2
## + pop65           1         1  92275 2062.5
## - bagrad          1      1163  93439 2063.1
## - beds_rate_1000  1      2217  94493 2067.2
## - pop18           1      4127  96403 2074.5
## - poverty         1      5160  97436 2078.4
## - totalinc        1      7152  99428 2085.9
## - pcincome        1      7324  99600 2086.5
## - pop             1      9371 101646 2094.0
## - region          3     35176 127451 2173.0
## 
## Step:  AIC=2059.05
## crime_rate_1000 ~ pop + pop18 + hsgrad + bagrad + poverty + unemp + 
##     pcincome + totalinc + region + docs_rate_1000 + beds_rate_1000 + 
##     density
## 
##                  Df Sum of Sq    RSS    AIC
## - docs_rate_1000  1       282  92702 2058.2
## - density         1       397  92817 2058.6
## - hsgrad          1       474  92894 2058.9
## <none>                         92420 2059.1
## - unemp           1       626  93046 2059.5
## + area            1       144  92276 2060.5
## + pop65           1         4  92416 2061.0
## - bagrad          1      1195  93615 2061.8
## - beds_rate_1000  1      2220  94640 2065.8
## - pop18           1      4080  96500 2072.9
## - poverty         1      5090  97510 2076.7
## - totalinc        1      7017  99437 2083.9
## - pcincome        1      7263  99683 2084.8
## - pop             1      9229 101649 2092.0
## - region          3     35031 127452 2171.0
## 
## Step:  AIC=2058.16
## crime_rate_1000 ~ pop + pop18 + hsgrad + bagrad + poverty + unemp + 
##     pcincome + totalinc + region + beds_rate_1000 + density
## 
##                  Df Sum of Sq    RSS    AIC
## - hsgrad          1       394  93095 2057.7
## <none>                         92702 2058.2
## - density         1       580  93282 2058.4
## - unemp           1       637  93339 2058.7
## + docs_rate_1000  1       282  92420 2059.1
## + area            1       147  92555 2059.6
## - bagrad          1       951  93653 2059.9
## + pop65           1         1  92701 2060.2
## - pop18           1      4264  96966 2072.7
## - poverty         1      5090  97792 2075.8
## - beds_rate_1000  1      5318  98020 2076.6
## - totalinc        1      6972  99674 2082.8
## - pcincome        1      7672 100373 2085.3
## - pop             1      9205 101907 2090.9
## - region          3     35510 128212 2171.2
## 
## Step:  AIC=2057.72
## crime_rate_1000 ~ pop + pop18 + bagrad + poverty + unemp + pcincome + 
##     totalinc + region + beds_rate_1000 + density
## 
##                  Df Sum of Sq    RSS    AIC
## - density         1       435  93531 2057.4
## <none>                         93095 2057.7
## - unemp           1       522  93618 2057.8
## - bagrad          1       561  93656 2057.9
## + hsgrad          1       394  92702 2058.2
## + area            1       202  92894 2058.9
## + docs_rate_1000  1       201  92894 2058.9
## + pop65           1         4  93091 2059.7
## - pop18           1      3989  97084 2071.1
## - poverty         1      4766  97861 2074.0
## - beds_rate_1000  1      5331  98426 2076.2
## - totalinc        1      7222 100317 2083.1
## - pcincome        1      7324 100420 2083.5
## - pop             1      9501 102597 2091.4
## - region          3     35119 128215 2169.2
## 
## Step:  AIC=2057.43
## crime_rate_1000 ~ pop + pop18 + bagrad + poverty + unemp + pcincome + 
##     totalinc + region + beds_rate_1000
## 
##                  Df Sum of Sq    RSS    AIC
## - unemp           1       504  94035 2057.4
## <none>                         93531 2057.4
## + density         1       435  93095 2057.7
## + area            1       388  93143 2057.9
## + docs_rate_1000  1       351  93180 2058.1
## - bagrad          1       709  94239 2058.2
## + hsgrad          1       249  93282 2058.4
## + pop65           1         0  93530 2059.4
## - pop18           1      4686  98217 2073.4
## - poverty         1      5568  99099 2076.7
## - beds_rate_1000  1      5625  99156 2076.9
## - totalinc        1      7389 100919 2083.3
## - pcincome        1      8910 102440 2088.8
## - pop             1      9968 103498 2092.6
## - region          3     34810 128341 2167.6
## 
## Step:  AIC=2057.4
## crime_rate_1000 ~ pop + pop18 + bagrad + poverty + pcincome + 
##     totalinc + region + beds_rate_1000
## 
##                  Df Sum of Sq    RSS    AIC
## <none>                         94035 2057.4
## + unemp           1       504  93531 2057.4
## + density         1       417  93618 2057.8
## + docs_rate_1000  1       371  93664 2057.9
## + area            1       260  93774 2058.4
## + hsgrad          1       165  93870 2058.8
## + pop65           1        12  94023 2059.4
## - bagrad          1      1394  95428 2060.8
## - pop18           1      4681  98715 2073.2
## - beds_rate_1000  1      5122  99156 2074.9
## - totalinc        1      7551 101586 2083.8
## - poverty         1      8455 102489 2087.0
## - pcincome        1     10060 104095 2092.7
## - pop             1     10133 104167 2093.0
## - region          3     35812 129846 2169.8

Variables chosen from stepwise regression:

bind_cols(backward[-1,1],both[-1,1]) %>% knitr::kable(caption = "Vairable selected from stepwise regression")
Vairable selected from stepwise regression
backward stepwise
pop pop
pop18 pop18
bagrad bagrad
poverty poverty
pcincome pcincome
totalinc totalinc
region2 region2
region3 region3
region4 region4
beds_rate_1000 beds_rate_1000

Criteria based selection

sb = regsubsets(crime_rate_1000 ~ ., data = cdi_model, nvmax = 14)
sumsb = summary(sb) # pop pop18 hsgrad bagrad poverty pcincome totalinc region beds_rate_1000 density
coef(sb, id = 12)
##    (Intercept)            pop          pop18         bagrad        poverty 
##  -8.317643e+01   8.025821e-05   1.249522e+00  -3.679141e-01   1.664217e+00 
##          unemp       pcincome       totalinc        region2        region3 
##   7.482408e-01   3.197803e-03  -3.406110e-03   1.203348e+01   2.967462e+01 
##        region4 beds_rate_1000        density 
##   2.462527e+01   3.087835e+00   8.669407e-04
par(mfrow=c(1,2))
plot(2:15, sumsb$cp, xlab="No. of parameters", ylab="Cp Statistic") 
abline(0,1)

plot(2:15, sumsb$adjr2, xlab="No of parameters", ylab="Adj R2")
\label{fig:figs}Subset selection for best parameter numbers

Subset selection for best parameter numbers

According to the output, we determine that the number of variables should be above 12 because \(C_p \leq p\). Based on this analysis, we find that unemp could also be selected.

Discussion

We need to remove totalinc, because it can be replaced. totalinc = pcincome * pop.

Model building from the vairables we selected

fit_nest = lm(crime_rate_1000 ~  
                  pop + pop18 + bagrad +
                  poverty + unemp + pcincome + pcincome*pop + region +
                  beds_rate_1000 + density, data = cdi_model)
summary(fit_nest)
## 
## Call:
## lm(formula = crime_rate_1000 ~ pop + pop18 + bagrad + poverty + 
##     unemp + pcincome + pcincome * pop + region + beds_rate_1000 + 
##     density, data = cdi_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.123  -9.614  -1.011   8.440  57.956 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -8.318e+01  1.549e+01  -5.371 1.42e-07 ***
## pop             8.026e-05  1.335e-05   6.011 4.59e-09 ***
## pop18           1.250e+00  3.208e-01   3.894 0.000118 ***
## bagrad         -3.679e-01  2.519e-01  -1.461 0.145035    
## poverty         1.664e+00  3.909e-01   4.257 2.66e-05 ***
## unemp           7.482e-01  5.310e-01   1.409 0.159717    
## pcincome        3.198e-03  6.059e-04   5.277 2.29e-07 ***
## region2         1.203e+01  2.618e+00   4.597 5.98e-06 ***
## region3         2.967e+01  2.682e+00  11.065  < 2e-16 ***
## region4         2.463e+01  3.128e+00   7.873 4.27e-14 ***
## beds_rate_1000  3.088e+00  6.859e-01   4.502 9.14e-06 ***
## density         8.670e-04  6.738e-04   1.287 0.199066    
## pop:pcincome   -3.406e-09  6.500e-10  -5.240 2.76e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.22 on 354 degrees of freedom
## Multiple R-squared:  0.563,  Adjusted R-squared:  0.5482 
## F-statistic: 38.01 on 12 and 354 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(fit_nest)
\label{fig:figs}Diagnose plots of model without interaction terms

Diagnose plots of model without interaction terms

boxcox(fit_nest)
\label{fig:figs}Boxcox plot of model without interaction terms

Boxcox plot of model without interaction terms

The peak of boxcox plot is close to around 0.5~1. Try \(\sqrt{y}\) transformation

transformation

cdi_model_trans = cdi_model %>% 
  mutate(
    y_sqrt = sqrt(crime_rate_1000)
  )

fit_nest_trans = lm(y_sqrt ~  
                   pop + pop18 + bagrad +
                  poverty + unemp + pcincome + pcincome*pop + region +
                  beds_rate_1000 + density, data = cdi_model_trans)
summary(fit_nest_trans)
## 
## Call:
## lm(formula = y_sqrt ~ pop + pop18 + bagrad + poverty + unemp + 
##     pcincome + pcincome * pop + region + beds_rate_1000 + density, 
##     data = cdi_model_trans)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0165 -0.6220 -0.0030  0.6178  3.4444 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -1.813e+00  1.060e+00  -1.710 0.088124 .  
## pop             5.046e-06  9.142e-07   5.520 6.56e-08 ***
## pop18           8.223e-02  2.197e-02   3.744 0.000212 ***
## bagrad         -2.693e-02  1.725e-02  -1.562 0.119298    
## poverty         9.395e-02  2.677e-02   3.510 0.000506 ***
## unemp           5.976e-02  3.636e-02   1.644 0.101141    
## pcincome        2.049e-04  4.149e-05   4.938 1.22e-06 ***
## region2         8.599e-01  1.792e-01   4.798 2.37e-06 ***
## region3         2.098e+00  1.836e-01  11.427  < 2e-16 ***
## region4         1.863e+00  2.141e-01   8.700  < 2e-16 ***
## beds_rate_1000  2.227e-01  4.696e-02   4.742 3.07e-06 ***
## density         5.455e-05  4.613e-05   1.182 0.237840    
## pop:pcincome   -2.115e-10  4.450e-11  -4.753 2.93e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.11 on 354 degrees of freedom
## Multiple R-squared:  0.5515, Adjusted R-squared:  0.5363 
## F-statistic: 36.28 on 12 and 354 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(fit_nest_trans)
\label{fig:figs}Diagnose plots of model without interaction terms

Diagnose plots of model without interaction terms

Compare to the diagnose plots of untransformed model, we found that the residuals are more unevenly distributed. Therefore, transformed model is worse. We select the untransformed model.

Our first model: \[crime\_rate\_1000 = pop + pop18 + bagrad + poverty + unemp \\ + pcincome + pcincome*pop + region beds\_rate\_1000 + density\]

Add Interaction term: poverty+income

According to Census Bureau, the number of persons below the official government poverty level was 33.6 million in 1990, representing 13.5 percent of the Nation’s population. Thus, we can use this criteria to divide poverty into two category: higher than national poverty rate and lower than national poverty rate.

poverty_status = cdi_model %>% 
  mutate(national_poverty = if_else(poverty > 13.5, "higher", "lower"))

ggplot(poverty_status, aes(x = pcincome, y = crime_rate_1000, color = national_poverty)) + 
  geom_point(alpha = .5) +
  geom_smooth(method = "lm", se = F, aes(group = national_poverty, color = national_poverty)) +
  labs(
    title = "Crime Rate and Per Capita Income by Poverty Status",
    x = "Per Capita Income",
    y = "Crime Rate ",
    color = "Comparison with national avergae"
  )
\label{fig:figs}Interaction plot of Income Per Capita and Poverty

Interaction plot of Income Per Capita and Poverty

fit_int1 = lm(crime_rate_1000 ~  
                   pop + pop18 + bagrad +
                  poverty + unemp + pcincome + pcincome*pop + region +
                  beds_rate_1000 + density +
                  poverty*pcincome, data = cdi_model)
summary(fit_int1) %>% broom::tidy()
## # A tibble: 14 × 5
##    term             estimate std.error statistic  p.value
##    <chr>               <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)      -4.88e+1  1.75e+ 1    -2.80  5.46e- 3
##  2 pop               6.17e-5  1.39e- 5     4.44  1.19e- 5
##  3 pop18             1.14e+0  3.16e- 1     3.60  3.57e- 4
##  4 bagrad           -2.62e-1  2.48e- 1    -1.05  2.93e- 1
##  5 poverty          -2.54e+0  1.12e+ 0    -2.26  2.46e- 2
##  6 unemp             7.42e-1  5.20e- 1     1.43  1.55e- 1
##  7 pcincome          1.42e-3  7.43e- 4     1.91  5.72e- 2
##  8 region2           1.07e+1  2.59e+ 0     4.15  4.20e- 5
##  9 region3           2.80e+1  2.66e+ 0    10.5   1.19e-22
## 10 region4           2.22e+1  3.13e+ 0     7.08  7.60e-12
## 11 beds_rate_1000    2.01e+0  7.25e- 1     2.77  5.91e- 3
## 12 density           2.01e-4  6.81e- 4     0.295 7.68e- 1
## 13 pop:pcincome     -2.56e-9  6.71e-10    -3.82  1.59e- 4
## 14 poverty:pcincome  2.80e-4  7.05e- 5     3.98  8.52e- 5
check_collinearity(fit_int1)
## # Check for Multicollinearity
## 
## Low Correlation
## 
##              Term  VIF Increased SE Tolerance
##               pop 1.00         1.00      1.00
##             pop18 2.10         1.45      0.48
##            bagrad 2.61         1.62      0.38
##           poverty 1.18         1.09      0.85
##             unemp 1.69         1.30      0.59
##          pcincome 1.12         1.06      0.89
##            region 1.59         1.26      0.63
##    beds_rate_1000 1.38         1.18      0.72
##           density 1.01         1.01      0.99
##      pop:pcincome 1.00         1.00      1.00
##  poverty:pcincome 1.00         1.00      1.00

We notice that density, bagrad are not significant

# remove density
fit_int1 = lm(crime_rate_1000 ~  
                   pop + pop18 + bagrad +
                  poverty + unemp + pcincome + pcincome*pop + region +
                  beds_rate_1000  +
                  poverty*pcincome, data = cdi_model)
summary(fit_int1)
## 
## Call:
## lm(formula = crime_rate_1000 ~ pop + pop18 + bagrad + poverty + 
##     unemp + pcincome + pcincome * pop + region + beds_rate_1000 + 
##     poverty * pcincome, data = cdi_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.560  -8.893  -0.811   8.674  64.139 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -4.943e+01  1.731e+01  -2.855 0.004553 ** 
## pop               6.171e-05  1.387e-05   4.448 1.16e-05 ***
## pop18             1.153e+00  3.109e-01   3.708 0.000242 ***
## bagrad           -2.687e-01  2.467e-01  -1.089 0.276790    
## poverty          -2.594e+00  1.107e+00  -2.342 0.019719 *  
## unemp             7.393e-01  5.195e-01   1.423 0.155595    
## pcincome          1.431e-03  7.414e-04   1.930 0.054403 .  
## region2           1.067e+01  2.575e+00   4.143 4.29e-05 ***
## region3           2.790e+01  2.647e+00  10.539  < 2e-16 ***
## region4           2.207e+01  3.110e+00   7.096 7.03e-12 ***
## beds_rate_1000    2.004e+00  7.238e-01   2.768 0.005932 ** 
## pop:pcincome     -2.555e-09  6.699e-10  -3.814 0.000161 ***
## poverty:pcincome  2.856e-04  6.829e-05   4.182 3.65e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.87 on 354 degrees of freedom
## Multiple R-squared:  0.5816, Adjusted R-squared:  0.5674 
## F-statistic: 41.01 on 12 and 354 DF,  p-value: < 2.2e-16
check_collinearity(fit_int1)
## # Check for Multicollinearity
## 
## Low Correlation
## 
##              Term  VIF Increased SE Tolerance
##               pop 1.00         1.00      1.00
##             pop18 2.08         1.44      0.48
##            bagrad 2.59         1.61      0.39
##           poverty 1.18         1.09      0.85
##             unemp 1.69         1.30      0.59
##          pcincome 1.13         1.06      0.89
##            region 1.59         1.26      0.63
##    beds_rate_1000 1.39         1.18      0.72
##      pop:pcincome 1.00         1.00      1.00
##  poverty:pcincome 1.00         1.00      1.00
# remove bagrad
fit_int1 = lm(crime_rate_1000 ~  
                   pop + pop18 + 
                  poverty + unemp + pcincome + pcincome*pop + region +
                  beds_rate_1000  +
                  poverty*pcincome, data = cdi_model)
summary(fit_int1)
## 
## Call:
## lm(formula = crime_rate_1000 ~ pop + pop18 + poverty + unemp + 
##     pcincome + pcincome * pop + region + beds_rate_1000 + poverty * 
##     pcincome, data = cdi_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.933  -8.984  -0.825   9.062  64.974 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -4.143e+01  1.568e+01  -2.642 0.008603 ** 
## pop               6.001e-05  1.379e-05   4.352 1.76e-05 ***
## pop18             9.277e-01  2.321e-01   3.996 7.84e-05 ***
## poverty          -2.777e+00  1.095e+00  -2.536 0.011647 *  
## unemp             9.432e-01  4.847e-01   1.946 0.052468 .  
## pcincome          9.798e-04  6.151e-04   1.593 0.112068    
## region2           1.079e+01  2.573e+00   4.194 3.47e-05 ***
## region3           2.769e+01  2.641e+00  10.485  < 2e-16 ***
## region4           2.132e+01  3.033e+00   7.028 1.08e-11 ***
## beds_rate_1000    1.988e+00  7.238e-01   2.746 0.006339 ** 
## pop:pcincome     -2.457e-09  6.640e-10  -3.700 0.000249 ***
## poverty:pcincome  2.957e-04  6.766e-05   4.371 1.63e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.87 on 355 degrees of freedom
## Multiple R-squared:  0.5802, Adjusted R-squared:  0.5672 
## F-statistic: 44.61 on 11 and 355 DF,  p-value: < 2.2e-16
check_collinearity(fit_int1)
## # Check for Multicollinearity
## 
## Low Correlation
## 
##              Term  VIF Increased SE Tolerance
##               pop 1.00         1.00      1.00
##             pop18 1.16         1.08      0.86
##           poverty 1.17         1.08      0.86
##             unemp 1.47         1.21      0.68
##          pcincome 1.09         1.04      0.92
##            region 1.47         1.21      0.68
##    beds_rate_1000 1.39         1.18      0.72
##      pop:pcincome 1.00         1.00      1.00
##  poverty:pcincome 1.00         1.00      1.00

diagnose

par(mfrow = c(2,2))
plot(fit_int1)
\label{fig:figs}Diagnose plots with interaction terms:poverty*pcincome

Diagnose plots with interaction terms:poverty*pcincome

boxcox(fit_int1)
\label{fig:figs}Boxcox plot with interaction terms:poverty*pcincome

Boxcox plot with interaction terms:poverty*pcincome

The peak of boxcox plot is close to around 0.5~1. Try \(\sqrt{y}\) transformation

transformation

cdi_model_trans = cdi_model %>% 
  mutate(
    y_sqrt = sqrt(crime_rate_1000)
  )

fit_int1_trans = lm(y_sqrt ~  
                   pop + pop18 + 
                  poverty + unemp + pcincome + pcincome*pop + region +
                  beds_rate_1000  +
                  poverty*pcincome, data = cdi_model_trans)
summary(fit_int1_trans)
## 
## Call:
## lm(formula = y_sqrt ~ pop + pop18 + poverty + unemp + pcincome + 
##     pcincome * pop + region + beds_rate_1000 + poverty * pcincome, 
##     data = cdi_model_trans)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9383 -0.5569  0.0166  0.6563  3.7968 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       9.271e-01  1.078e+00   0.860 0.390234    
## pop               3.748e-06  9.476e-07   3.955 9.24e-05 ***
## pop18             5.881e-02  1.596e-02   3.686 0.000264 ***
## poverty          -1.882e-01  7.525e-02  -2.500 0.012857 *  
## unemp             7.489e-02  3.331e-02   2.248 0.025191 *  
## pcincome          5.891e-05  4.228e-05   1.394 0.164333    
## region2           7.833e-01  1.768e-01   4.430 1.26e-05 ***
## region3           1.970e+00  1.815e-01  10.855  < 2e-16 ***
## region4           1.644e+00  2.085e-01   7.888 3.84e-14 ***
## beds_rate_1000    1.532e-01  4.975e-02   3.079 0.002236 ** 
## pop:pcincome     -1.504e-10  4.563e-11  -3.295 0.001084 ** 
## poverty:pcincome  1.876e-05  4.650e-06   4.034 6.71e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.091 on 355 degrees of freedom
## Multiple R-squared:  0.5659, Adjusted R-squared:  0.5524 
## F-statistic: 42.07 on 11 and 355 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(fit_int1_trans)
\label{fig:figs}Diagnose plots with interaction terms:poverty*pcincome

Diagnose plots with interaction terms:poverty*pcincome

Compare to the diagnose plots of untransformed model, we found that the residuals are more unevenly distributed. Therefore, transformed model is worse. We select the untransformed model.

Our second model: \[crime\_rate\_1000 = pop + pop18 + poverty + unemp + pcincome + \\ pcincome*pop + region + beds\_rate\_1000 + poverty*pcincome\]

Add interaction term: pcincome + bagrad

According to Census Bureau, national percent of persons 25 years old or older with bachelor’s degrees is 20.8%. Thus, we can use this criteria to divide bagrad into two category: higher than national bagrad and lower than national bargrad.

bagrad_status = cdi_model %>% 
  mutate(national_bagrad = if_else(bagrad > 20.8, "higher", "lower"))

ggplot(bagrad_status, aes(x = pcincome, y = crime_rate_1000, color = national_bagrad)) + 
  geom_point(alpha = .5) +
  geom_smooth(method = "lm", se = F, aes(group = national_bagrad, color = national_bagrad)) +
  ylim(0,150) +
  labs(
    title = "Crime Rate and Per Capita Income by Percent Bachelor's Degrees Status",
    x = "Per Capita Income",
    y = "Crime Rate",
    color = "Comparison with national avergae"
  )
\label{fig:figs}Interaction plot of Income Per Capita and Bachelor's Degree Status

Interaction plot of Income Per Capita and Bachelor’s Degree Status

fit_int2 = lm(crime_rate_1000 ~  
                  pop + pop18 + bagrad +
                  poverty + unemp + pcincome + pcincome*pop + region +
                  beds_rate_1000 + density +
                  pcincome*bagrad, data = cdi_model)
summary(fit_int2)
## 
## Call:
## lm(formula = crime_rate_1000 ~ pop + pop18 + bagrad + poverty + 
##     unemp + pcincome + pcincome * pop + region + beds_rate_1000 + 
##     density + pcincome * bagrad, data = cdi_model)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -42.88  -9.84  -0.85   8.19  61.74 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1.141e+02  1.963e+01  -5.814 1.37e-08 ***
## pop              6.518e-05  1.453e-05   4.487 9.79e-06 ***
## pop18            1.048e+00  3.282e-01   3.193 0.001537 ** 
## bagrad           1.252e+00  6.865e-01   1.824 0.069046 .  
## poverty          1.906e+00  3.995e-01   4.770 2.69e-06 ***
## unemp            9.002e-01  5.304e-01   1.697 0.090555 .  
## pcincome         5.012e-03  9.351e-04   5.360 1.51e-07 ***
## region2          1.244e+01  2.603e+00   4.778 2.60e-06 ***
## region3          2.979e+01  2.662e+00  11.191  < 2e-16 ***
## region4          2.432e+01  3.107e+00   7.827 5.87e-14 ***
## beds_rate_1000   2.740e+00  6.944e-01   3.946 9.58e-05 ***
## density          1.007e-03  6.710e-04   1.500 0.134491    
## pop:pcincome    -2.706e-09  7.018e-10  -3.856 0.000137 ***
## bagrad:pcincome -8.009e-05  3.161e-05  -2.534 0.011723 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.09 on 353 degrees of freedom
## Multiple R-squared:  0.5708, Adjusted R-squared:  0.555 
## F-statistic: 36.11 on 13 and 353 DF,  p-value: < 2.2e-16
check_collinearity(fit_int2)
## # Check for Multicollinearity
## 
## Low Correlation
## 
##             Term  VIF Increased SE Tolerance
##              pop 1.00         1.00      1.00
##            pop18 1.38         1.17      0.72
##           bagrad 1.63         1.27      0.62
##          poverty 1.96         1.40      0.51
##            unemp 1.46         1.21      0.69
##         pcincome 1.14         1.07      0.87
##           region 1.51         1.23      0.66
##   beds_rate_1000 1.82         1.35      0.55
##          density 1.02         1.01      0.98
##     pop:pcincome 1.00         1.00      1.00
##  bagrad:pcincome 1.00         1.00      1.00

diagnose

par(mfrow = c(2,2))
plot(fit_int2)
\label{fig:figs}Diagnose plots with interaction terms:pcincome*bagrad

Diagnose plots with interaction terms:pcincome*bagrad

boxcox(fit_int2)
\label{fig:figs}Boxcox plot with interaction terms:pcincome*bagrad

Boxcox plot with interaction terms:pcincome*bagrad

The peak of boxcox plot is close to around 0.5~1. Try \(\sqrt{y}\) transformation

transformation

fit_int2_trans = lm(y_sqrt ~  
                  pop + pop18 + bagrad +
                  poverty + unemp + pcincome + pcincome*pop + region +
                  beds_rate_1000 + density +
                  pcincome*bagrad, data = cdi_model_trans)
summary(fit_int2_trans)
## 
## Call:
## lm(formula = y_sqrt ~ pop + pop18 + bagrad + poverty + unemp + 
##     pcincome + pcincome * pop + region + beds_rate_1000 + density + 
##     pcincome * bagrad, data = cdi_model_trans)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9297 -0.6537 -0.0077  0.6205  3.6242 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -4.111e+00  1.342e+00  -3.064 0.002351 ** 
## pop              3.926e-06  9.930e-07   3.954 9.28e-05 ***
## pop18            6.725e-02  2.244e-02   2.998 0.002914 ** 
## bagrad           9.341e-02  4.693e-02   1.990 0.047307 *  
## poverty          1.119e-01  2.731e-02   4.098 5.18e-05 ***
## unemp            7.105e-02  3.626e-02   1.960 0.050833 .  
## pcincome         3.396e-04  6.392e-05   5.314 1.91e-07 ***
## region2          8.899e-01  1.779e-01   5.002 8.98e-07 ***
## region3          2.107e+00  1.820e-01  11.577  < 2e-16 ***
## region4          1.840e+00  2.123e-01   8.666  < 2e-16 ***
## beds_rate_1000   1.968e-01  4.746e-02   4.147 4.22e-05 ***
## density          6.492e-05  4.587e-05   1.415 0.157840    
## pop:pcincome    -1.595e-10  4.797e-11  -3.325 0.000978 ***
## bagrad:pcincome -5.950e-06  2.161e-06  -2.753 0.006202 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.1 on 353 degrees of freedom
## Multiple R-squared:  0.561,  Adjusted R-squared:  0.5448 
## F-statistic: 34.69 on 13 and 353 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(fit_int2_trans)
\label{fig:figs}Diagnose plots with interaction terms:pcincome*bagrad

Diagnose plots with interaction terms:pcincome*bagrad

Compare to the diagnose plots of untransformed model, we found that the residuals are more unevenly distributed. Therefore, transformed model is worse. We select the untransformed model.

Our third model: \[crime\_rate\_1000 = pop + pop18 + bagrad + poverty + unemp + pcincome + \\ pcincome*pop + region + beds\_rate\_1000 + density + pcincome*bagrad\]
## Cross validation

model 1

set.seed(1)
train = trainControl(method = "cv", number = 5)

model_train1 = train(crime_rate_1000 ~  
                  pop + pop18 + bagrad +
                  poverty + unemp + pcincome + pcincome*pop + region +
                  beds_rate_1000 + density,data = cdi_model,
                   trControl = train,
                   method = 'lm',
                   na.action = na.pass)
print(model_train1)
## Linear Regression 
## 
## 367 samples
##   9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 293, 294, 294, 294, 293 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   16.60425  0.5343078  12.52969
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

model 2

set.seed(1)
train = trainControl(method = "cv", number = 5)

model_train2 = train(crime_rate_1000 ~  
                   pop + pop18 + 
                  poverty + unemp + pcincome + pcincome*pop + region +
                  beds_rate_1000  +
                  poverty*pcincome, data = cdi_model,
                   trControl = train,
                   method = 'lm',
                   na.action = na.pass)

summary(model_train2)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.933  -8.984  -0.825   9.062  64.974 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -4.143e+01  1.568e+01  -2.642 0.008603 ** 
## pop                 6.001e-05  1.379e-05   4.352 1.76e-05 ***
## pop18               9.277e-01  2.321e-01   3.996 7.84e-05 ***
## poverty            -2.777e+00  1.095e+00  -2.536 0.011647 *  
## unemp               9.432e-01  4.847e-01   1.946 0.052468 .  
## pcincome            9.798e-04  6.151e-04   1.593 0.112068    
## region2             1.079e+01  2.573e+00   4.194 3.47e-05 ***
## region3             2.769e+01  2.641e+00  10.485  < 2e-16 ***
## region4             2.132e+01  3.033e+00   7.028 1.08e-11 ***
## beds_rate_1000      1.988e+00  7.238e-01   2.746 0.006339 ** 
## `pop:pcincome`     -2.457e-09  6.640e-10  -3.700 0.000249 ***
## `poverty:pcincome`  2.957e-04  6.766e-05   4.371 1.63e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.87 on 355 degrees of freedom
## Multiple R-squared:  0.5802, Adjusted R-squared:  0.5672 
## F-statistic: 44.61 on 11 and 355 DF,  p-value: < 2.2e-16

model 3

set.seed(1)
train = trainControl(method = "cv", number = 5)

model_train3 = train(crime_rate_1000 ~  
                  pop + pop18 + bagrad +
                  poverty + unemp + pcincome + pcincome*pop + region +
                  beds_rate_1000 + density +
                  pcincome*bagrad,  data = cdi_model,
                   trControl = train,
                   method = 'lm',
                   na.action = na.pass)
summary(model_train3)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -42.88  -9.84  -0.85   8.19  61.74 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.141e+02  1.963e+01  -5.814 1.37e-08 ***
## pop                6.518e-05  1.453e-05   4.487 9.79e-06 ***
## pop18              1.048e+00  3.282e-01   3.193 0.001537 ** 
## bagrad             1.252e+00  6.865e-01   1.824 0.069046 .  
## poverty            1.906e+00  3.995e-01   4.770 2.69e-06 ***
## unemp              9.002e-01  5.304e-01   1.697 0.090555 .  
## pcincome           5.012e-03  9.351e-04   5.360 1.51e-07 ***
## region2            1.244e+01  2.603e+00   4.778 2.60e-06 ***
## region3            2.979e+01  2.662e+00  11.191  < 2e-16 ***
## region4            2.432e+01  3.107e+00   7.827 5.87e-14 ***
## beds_rate_1000     2.740e+00  6.944e-01   3.946 9.58e-05 ***
## density            1.007e-03  6.710e-04   1.500 0.134491    
## `pop:pcincome`    -2.706e-09  7.018e-10  -3.856 0.000137 ***
## `bagrad:pcincome` -8.009e-05  3.161e-05  -2.534 0.011723 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.09 on 353 degrees of freedom
## Multiple R-squared:  0.5708, Adjusted R-squared:  0.555 
## F-statistic: 36.11 on 13 and 353 DF,  p-value: < 2.2e-16

Compare RMSE

model <- c("1", "2", "3")

RMSE <- c(round(model_train1$results$RMSE, 2),  round(model_train2$results$RMSE,2),
          round(model_train3$results$RMSE, 2))

R_sq <- c(round(model_train1$results$Rsquared, 3),
          round(model_train2$results$Rsquared, 3),
          round(model_train3$results$Rsquared, 3))

RMSE_table <- data.frame(model, RMSE, R_sq)

coefs_1 = model_train1$finalModel$coefficients
names_1 = model_train1$finalModel$xNames

knitr::kable(RMSE_table, caption = "RMSE table for three models")
RMSE table for three models
model RMSE R_sq
1 16.60 0.534
2 16.20 0.561
3 16.46 0.542

The second model has the lowest RMSE.

Model Assessment on testing set

test_data = test_data %>%
  mutate(
    y = crime_rate_1000,
    y_model_1 = predict(model_train1,test_data),
    y_model_2 = predict(model_train2,test_data),
    y_model_3 = predict(model_train3,test_data))

RMSPE_1 = sqrt(mean((test_data$y-test_data$y_model_1)^2))
RMSPE_2 = sqrt(mean((test_data$y-test_data$y_model_2)^2))
RMSPE_3 = sqrt(mean((test_data$y-test_data$y_model_3)^2))



model_assessment = 
  tibble(
    RMSPE_1 = round(RMSPE_1,2),
    RMSPE_2 = round(RMSPE_2,2),
    RMSPE_3 = round(RMSPE_3,2)) %>% 
  pivot_longer(RMSPE_1:RMSPE_3,
               names_to = "model", 
               names_prefix = "RMSPE_",
               values_to = "RMSPE") %>%
  left_join(RMSE_table,by="model") %>%
  dplyr::select(Model=model,R_square = R_sq,RMSE,RMSPE)

knitr::kable(model_assessment, caption = "Model assessment table", booktabs = TRUE)
Model assessment table
Model R_square RMSE RMSPE
1 0.534 16.60 12.06
2 0.561 16.20 12.32
3 0.542 16.46 11.90